// LEC March 26, 2021

// Component for data_prep.do
// Ethnic var procedure for CShapes data

////////////////////////////////////////////////////////////////////////////////// 
// Creates ethnicvars file which holds ethnic properties of sov. states including dep.
//////////////////////////////////////////////////////////////////////////////////

// Read the segment-level input; directory and file name come from globals
// ($INDIR, $SEGMENTFILE) that are defined outside this file.
cd $INDIR
insheet using $SEGMENTFILE, clear

// Hook for the EPR AG groups: the two lines below would load them instead
// cd "/Users/larsc/polybox/Shared/NASTACdb/ethnicity-698/CShapes" 
// insheet using "CShapesGeoEPRAggSegments.csv", clear

// A unit with a missing holder is treated as its own holder.
// (Required for the CShapes sov. data, where holder is not filled in.)
replace holder = id if holder == .

// mid: metropolitan id. Set only where the unit is its own holder
// (cores of empires and sovereign states); left missing for every other row.
generate mid = id if id == holder

// segid: one numeric code per (unit, ethnic group) pair, built from the
// unit id and a running index over the distinct ethnicid values.
egen ide = group(ethnicid)
generate long segid = .
replace segid = id*10000 + ide
// State-level totals. These are taken over every row in the file, i.e. BEFORE
// the incomplete segments are discarded further down.
// ngroups: number of rows with a non-missing ethnicid per holder-year
bysort holder year: egen ngroups = count(ethnicid)
// sumpop: population summed over all units that share a holder
bysort holder year: egen sumpop = total(ethnicpopulationcorr)
// sumallpop: population summed within the unit itself
bysort id year: egen sumallpop = total(ethnicpopulationcorr)


// Discard segments that lack either a capital distance or a population figure
drop if mindistancecapital == . | ethnicpopulationcorr == .

// Flag the "capital segment" (segcap == 1) of each unit-year.

// Step 1: every segment whose distance to the capital is below 0.1 qualifies.
gen segcap = 0
replace segcap = 1 if mindistancecapital < 0.1

bys id year: egen nsegcap = sum(segcap)

// Step 2: if nothing qualified, fall back to the segment closest to the capital.
// egen stores results as float unless told otherwise; request double so the
// equality test below still matches when mindistancecapital is long or double.
bys id year: egen double mindistcap = min(mindistancecapital)
replace segcap = 1 if nsegcap==0 & mindistcap==mindistancecapital & mindistancecapital != .

drop nsegcap
bys id year: egen nsegcap = sum(segcap)

// Step 3: if several segments qualify, keep only the most populous of them.
// maxsegcappop is the largest population among the current capital segments
// (non-capital rows contribute 0). Double storage keeps the comparison exact
// for populations that a float cannot represent (integers above 2^24).
bys id year: egen double maxsegcappop = max(segcap*ethnicpopulationcorr)
// One assignment instead of "zero everything, then re-flag by population":
// this way a non-capital segment that merely has the same population as the
// largest capital segment can no longer be promoted to capital segment.
replace segcap = (segcap == 1 & maxsegcappop == ethnicpopulationcorr) if nsegcap>1

// Recount. nsegcap can still exceed 1 when capital segments tie on population.
drop nsegcap
bys id year: egen nsegcap = sum(segcap)

// segcapsize: population of the capital segment within the metropolitan unit.
// Rows where mid is missing share one by-group per year.
// Stored as double so the value is not rounded when it later replaces maxeth.
bys mid year: egen double segcapsize = max(segcap*ethnicpopulationcorr)

// maxeth is the size of the largest ethnic group segment in the metropolitan part of the state
// NOTE: This means "English" for the UK (largest group within the British Isles rather than in the empire)
// egen defaults to float storage; double is requested so that the equality test
// on the next lines still matches when ethnicpopulationcorr is long or double
// (a float cannot hold integers above 2^24 exactly, so the largest segment
// would otherwise never be flagged).
bys mid year: egen double maxeth = max(ethnicpopulationcorr)
gen maxethflag = 0
replace maxethflag = 1 if ethnicpopulationcorr==maxeth & ethnicpopulationcorr != .

// $CAPITAL is a global defined outside this file and expanded in place as the
// if-condition. Where it holds, the capital segment takes the place of the
// largest segment, both as the flag and as the reported size.
replace maxethflag = segcap if $CAPITAL
replace maxeth = segcapsize if $CAPITAL

// The two main ethnic indicators
// maxshare: share of the largest group, i.e. maxeth relative to sumpop
generate maxshare = maxeth/sumpop
// KEY VARIABLE: ef, ethnic fractionalization = 1 - sum of squared population shares,
// where the squared shares are accumulated within each holder-year
bysort holder year: egen sumsq = total((ethnicpopulationcorr/sumpop)^2)
generate ef = 1 - sumsq  // no fallback value: ef is missing wherever sumsq is missing

// Properties of the Aggregate Group (AG), i.e. all segments that share an
// ethnicid in a given year
// nag: number of AG segments
bysort ethnicid year: egen nag = count(ethnicid)
// sumagpop: total AG population
bysort ethnicid year: egen sumagpop = total(ethnicpopulationcorr)
// maxag: population of the largest AG segment
bysort ethnicid year: egen maxag = max(ethnicpopulationcorr)
// agtf: territorial fractionalization of the AG, 1 - sum of squared segment
// shares of the AG population
bysort ethnicid year: egen sumsqag = total((ethnicpopulationcorr/sumagpop)^2)
generate agtf = 1 - sumsqag


// Track, per segment, the lowest AG territorial fractionalization seen so far
// and how far the current value lies above it.

// minagyear: first year in which the AG (ethnicid) appears in the data
bys ethnicid: egen minagyear = min(year)
// Declare the panel so the lag operator can be used below.
// xtset requires each (segid, year) pair to occur only once.
xtset segid year
// Seed: the AG's own value in its first year, missing in every other year
gen minagtf = agtf if year==minagyear
// Running minimum within each segment: the smaller of this year's agtf and
// last year's minagtf. min() ignores missing arguments, so a row without a
// usable lag (first year of the segment, or a gap in the years) gets its own agtf.
// NOTE(review): this relies on replace processing rows in year order so that
// l.minagtf already holds the updated value -- confirm the lag cascades as intended.
bys segid (year): replace minagtf = min(agtf,l.minagtf)

// agtfincr: increase of agtf over that running minimum, 0 when there is none
gen agtfincr = 0
replace agtfincr = agtf - minagtf if agtf - minagtf > 0

// Compute AG properties relating to the AG of the largest group: maxethid
// Pattern used throughout: maxethflag is 0/1, so max(maxethflag*x) within a
// mid-year returns the value of x on the flagged row and copies it to every
// row of that mid-year.
// number of segments
bys mid year: egen agn = max(maxethflag*nag)
// overall AG population
bys mid year: egen agpop = max(maxethflag*sumagpop)
// population share of AG linked to largest group
bys mid year: egen agmax = max(maxethflag*maxshare)
// KEY VARIABLE: AG terr. frac. related to AG of state's largest group segment
bys mid year: egen agfrac = max(maxethflag*agtf)
bys mid year: egen agfracincr = max(maxethflag*agtfincr)
// ID code of the AG linked to the state's largest group segment
// Stored as double: egen's default float storage rounds integer codes above
// 2^24, which would make the ID wrong and break the equality test used for
// maxethname below.
bys mid year: egen double maxethid = max(maxethflag*ethnicid)
// share of state's largest group segment compared to AG population
bys mid year: egen maxagshare = max(maxethflag*maxeth/sumagpop)
// name of the AG group linked to largest group segment
// (filled only on rows belonging to that group, missing elsewhere)
gen maxethname = ethnicname if maxethid==ethnicid

// sum share of all AG spillover:
// sumcntragpop adds up, over the unit's segments, the total population of each
// segment's AG; sumagshare is the unit's own population relative to that sum
bys id year: egen sumcntragpop =sum(sumagpop)
gen sumagshare = sumallpop/sumcntragpop

// Save AG data to be used by AG based routine
// $OUTDIR is a global defined outside this file. All files below are written
// to this directory; it is entered once and stays current for the rest of the
// script (a second "cd $OUTDIR" would fail or nest for a relative path).
cd $OUTDIR
save agdata, replace

// From now on, drop all dependencies from empires from data while retaining mid-based vars
// The goal is to report on data for sovereign polities only
drop if holder != id

// Segment-level file restricted to units that are their own holder
save segvars, replace

// drop holder ID variable
drop holder

// Largest segment first within each unit-year, ahead of the (firstnm) picks below.
// NOTE(review): this assumes collapse keeps the within-group row order -- confirm.
gsort id year -ethnicpopulationcorr

// Now sum up all segment data for each state: one row per id-year.
// name and maxethname take the first non-missing value; every other variable
// takes its maximum within the id-year.
collapse (firstnm) name maxethname ///
 (max) ngroups sumpop maxshare maxeth ef agn agpop maxagshare ///
 agfrac agfracincr maxethid sumagshare, by(id year)
sort id year

save ethnicvars, replace


///// AG fractionalization recomputed from segvars, then aggregated /////////////////////
use segvars, clear

// One row per (AG, holder, year): population and area are summed over the
// holder's segments, maxethflag is 1 if any of those segments carried the flag
collapse (firstnm) ethnicname (max) maxethflag ///
 (sum) ethnicpopulationcorr ethnicareacorr, by(ethnicid holder year)



// s: this holder's share of the AG's total population in the year
bysort ethnicid year: egen sumagpop = total(ethnicpopulationcorr)
generate s = ethnicpopulationcorr / sumagpop

// tfagpop: AG fractionalization across holders, 1 - sum of squared shares
bysort ethnicid year: egen sumagsq = total(s^2)
generate tfagpop = 1 - sumagsq

// tfagpop of the holder's flagged group.
// NOTE(review): agfracpop is not listed in the collapse below, so it is never saved.
bysort holder year: egen agfracpop = max(maxethflag*tfagpop)

// Total AG area, and tfagpop pre-multiplied by the two weights used further down
bysort ethnicid year: egen sumagarea = total(ethnicareacorr)

generate areaXtfagpop = sumagarea*tfagpop
generate popXtfagpop = sumagpop*tfagpop

// Holder-year level: mean tfagpop plus summed weights and weighted terms
collapse (mean) tfagpop (sum) sumagarea sumagpop areaXtfagpop popXtfagpop, by(holder year)
save ethnicvars3, replace

// Year level: same statistics taken over the holder-year rows
collapse (mean) tfagpop (sum) sumagarea sumagpop areaXtfagpop popXtfagpop, by(year)

// Area-weighted (aw) and population-weighted (pw) averages of tfagpop
generate tfagpopaw = areaXtfagpop / sumagarea
generate tfagpoppw = popXtfagpop / sumagpop

save ethnicvars2, replace


//tset year
// tsline tfagpop tfagpopaw
